library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
theme_set(theme_bw(base_size = 12))
setwd("~/Documents/git/popgen-project/")
df <- read_tsv("ArchaicAdmixture/ArchaicSegments.txt") %>%
mutate(length = end-start+1000) %>%
filter(length>0)
## Rows: 524714 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (7): name, chrom, pop, country, region, outgroup, method
## dbl (8): start, end, length, snps, MeanProb, Shared_with_Altai, Shared_with_...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df) %>% knitr::kable()
| name | chrom | start | end | length | snps | pop | country | region | MeanProb | Shared_with_Altai | Shared_with_Denisova | Shared_with_Vindija | outgroup | method |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ERS699811 | 1 | 1562000 | 1603000 | 42000 | 5 | Toto | India | EastAsia | 0.5572001 | 0 | 0 | 0 | SubAfricans | HMM |
| ERS699811 | 1 | 2791000 | 2815000 | 25000 | 12 | Toto | India | EastAsia | 0.8304250 | 7 | 0 | 5 | SubAfricans | HMM |
| ERS699811 | 1 | 3058000 | 3113000 | 56000 | 36 | Toto | India | EastAsia | 0.9733212 | 10 | 6 | 12 | SubAfricans | HMM |
| ERS699811 | 1 | 3341000 | 3384000 | 44000 | 21 | Toto | India | EastAsia | 0.9102532 | 9 | 1 | 8 | SubAfricans | HMM |
| ERS699811 | 1 | 3433000 | 3453000 | 21000 | 13 | Toto | India | EastAsia | 0.9437119 | 8 | 0 | 7 | SubAfricans | HMM |
| ERS699811 | 1 | 3579000 | 3601000 | 23000 | 10 | Toto | India | EastAsia | 0.7721082 | 2 | 0 | 3 | SubAfricans | HMM |
summary(df)
## name chrom start end
## Length:524709 Length:524709 Min. : 3000 Min. : 47000
## Class :character Class :character 1st Qu.: 33423000 1st Qu.: 33502000
## Mode :character Mode :character Median : 71502000 Median : 71566000
## Mean : 80675269 Mean : 80761676
## 3rd Qu.:117715000 3rd Qu.:117799775
## Max. :249134000 Max. :249176000
## length snps pop country
## Min. : 1000 Min. : 0.00 Length:524709 Length:524709
## 1st Qu.: 29000 1st Qu.: 6.00 Class :character Class :character
## Median : 53204 Median : 11.00 Mode :character Mode :character
## Mean : 87407 Mean : 18.15
## 3rd Qu.: 104000 3rd Qu.: 22.00
## Max. :5932442 Max. :468.00
## region MeanProb Shared_with_Altai Shared_with_Denisova
## Length:524709 Min. :0.5000 Min. : 0.000 Min. : 0.000
## Class :character 1st Qu.:0.7398 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Median :0.8686 Median : 1.000 Median : 0.000
## Mean :0.8281 Mean : 5.731 Mean : 2.941
## 3rd Qu.:0.9396 3rd Qu.: 7.000 3rd Qu.: 3.000
## Max. :1.0000 Max. :267.000 Max. :188.000
## Shared_with_Vindija outgroup method
## Min. : 0.000 Length:524709 Length:524709
## 1st Qu.: 0.000 Class :character Class :character
## Median : 1.000 Mode :character Mode :character
## Mean : 6.318
## 3rd Qu.: 8.000
## Max. :263.000
How many individuals do we have?
length(unique(df$name))
## [1] 358
How many populations?
length(unique(df$pop))
## [1] 110
What regions do we have?
unique(df$region)
## [1] "EastAsia" "WestEurasia" "CentralAsiaSiberia"
## [4] "SouthAsia" "Melanesia"
mean_seg_pop <- df %>%
group_by(pop, region) %>%
summarise(`Mean length` = mean(length))
## `summarise()` has grouped output by 'pop'. You can override using the `.groups`
## argument.
head(mean_seg_pop)
## # A tibble: 6 × 3
## # Groups: pop [6]
## pop region `Mean length`
## <chr> <chr> <dbl>
## 1 Abkhasian WestEurasia 73275.
## 2 Adygei WestEurasia 75431.
## 3 Albanian WestEurasia 78357.
## 4 Aleut CentralAsiaSiberia 81293.
## 5 Altaian CentralAsiaSiberia 83659.
## 6 Ami EastAsia 83183.
mean_seg_pop %>%
ungroup() %>%
arrange(region) %>%
mutate(pop = factor(pop, pop)) %>%
ggplot(aes(x = pop, y = `Mean length`, fill = region)) +
geom_bar(position="dodge", stat="identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size=9))
Who has the highest mean segment length?
mean_seg_pop[which.max(mean_seg_pop$`Mean length`),]
## # A tibble: 1 × 3
## # Groups: pop [1]
## pop region `Mean length`
## <chr> <chr> <dbl>
## 1 Papuans Melanesia 101734.
Average length of segments by region?
mean_seg_reg <- df %>%
group_by(region) %>%
summarise(`Mean length` = mean(length))
mean_seg_reg %>%
ggplot(aes(x = region, y = `Mean length`)) +
geom_col() +
coord_flip()
total_seg_ind <- df %>%
group_by(name, pop, region) %>%
summarise(total_length = sum(length))
## `summarise()` has grouped output by 'name', 'pop'. You can override using the
## `.groups` argument.
total_seg_ind %>%
arrange(total_length) %>%
ggplot(aes(x=total_length, fill=region)) +
geom_histogram(bins=100)
total_seg_ind %>%
group_by(region, pop) %>%
summarise(mean_total_length = mean(total_length)) %>%
ungroup() %>%
arrange(region) %>%
mutate(pop = factor(pop,pop)) %>%
ggplot(aes(x=pop, y=mean_total_length, fill=region)) +
geom_bar(position='dodge', stat='identity') +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
df %>%
pivot_longer(
cols = starts_with("Shared_with_"), # Pivot the shared columns
names_to = "shared_with", # Create a new column for group information
values_to = "count" # Create a new column for the y-values
) %>%
ggplot(aes(x=region, y=count, fill=shared_with)) +
geom_bar(stat='identity', position='dodge')

df %>%
ggplot(aes(length)) +
geom_histogram(bins=30) +
facet_wrap(~region, nrow=1) +
xlim(0,1e+6) +
theme_bw() +
theme(
axis.text.x = element_text(angle=90,hjust=1)
) +
NULL
## Warning: Removed 511 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 10 rows containing missing values (`geom_bar()`).
df %>%
group_by(region) %>%
ggplot(aes(y=length,x=region)) +
theme_bw() +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
mean_seg_pop %>%
ungroup() %>%
arrange(region) %>%
mutate(pop = factor(pop, pop)) %>%
ggplot(aes(x = pop, y = `Mean length`, fill = region)) +
geom_bar(position="dodge", stat="identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size=9))
Differences in recombination rate, more than one wave of introgression in one of the regions. Recent gene flow/introgression from denisovans/neanderthal after migration out of Africa. Generation time.
snps <- read_tsv("ArchaicAdmixture/SNPs.txt")
## Rows: 11624912 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): name, chrom, region, archaic, Denis
## dbl (1): start
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(snps)
## # A tibble: 6 × 6
## name chrom start region archaic Denis
## <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 ERS699811 1 1562536 EastAsia human _SubAfr
## 2 ERS699811 1 1563789 EastAsia human _SubAfr
## 3 ERS699811 1 1564194 EastAsia human _SubAfr
## 4 ERS699811 1 1601052 EastAsia human _SubAfr
## 5 ERS699811 1 1603103 EastAsia human _SubAfr
## 6 ERS699811 1 2792312 EastAsia AltaiNea_nofil _SubAfr
unique(snps$archaic)
## [1] "human" "AltaiNea_nofil" "Vi33.19_nofil" "Denisova_nofil"
unique(snps$Denis)
## [1] "_SubAfr"
nrow(snps)
## [1] 11624912
df <- df %>%
mutate(origin = case_when(
Shared_with_Altai > Shared_with_Denisova | Shared_with_Vindija > Shared_with_Denisova ~ "Neanderthal",
Shared_with_Altai < Shared_with_Denisova & Shared_with_Vindija < Shared_with_Denisova ~ "Denisovan",
Shared_with_Altai + Shared_with_Vindija + Shared_with_Denisova == 0 ~ "Unclassified"
))
df %>%
group_by(name, origin, pop) %>%
summarise(total = sum(length)) %>%
ungroup() %>%
group_by(pop, origin) %>%
summarise(mean_seqs = mean(total)) %>%
ungroup() %>%
mutate(origin = factor(origin, c("Denisovan", "Unclassified", "Neanderthal"))) %>%
ggplot(aes(x=pop, y=mean_seqs*10^-6, fill=origin)) +
geom_bar(stat='identity') +
coord_flip() +
ylab("Mean detected archaic sequence per region (Mb)") +
theme(axis.text.x = element_text(angle=90, hjust=1))
## `summarise()` has grouped output by 'name', 'origin'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'pop'. You can override using the `.groups`
## argument.
There are a lot more archaic proportions on Papuans, especially from Denisovans. Differences in proportions and lengths between methods.
df %>%
group_by(name, origin, region) %>%
summarise(total = sum(length)) %>%
ungroup() %>%
group_by(region, origin) %>%
summarise(mean_seqs = mean(total)) %>%
ungroup() %>%
mutate(origin = factor(origin, c("Denisovan", "Unclassified", "Neanderthal"))) %>%
ggplot(aes(x=region, y=mean_seqs*10^-6, fill=origin)) +
geom_bar(stat='identity') +
coord_flip() +
ylab("Mean detected archaic sequence per region (Mb)") +
theme(axis.text.x = element_text(angle=90, hjust=1))
## `summarise()` has grouped output by 'name', 'origin'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
medians <- df %>%
filter(origin %in% c("Denisovan", "Neanderthal")) %>%
group_by(region, origin) %>%
summarise(median_length = (median(length)),
count=n(),
nsamples = length(unique(name)))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
df %>% filter(origin %in% c("Denisovan", "Neanderthal")) %>%
ggplot(aes(x=length, fill=origin)) +
geom_density(alpha=0.2) +
#geom_vline(aes(xintercept = median(length), color = origin)) +
geom_vline(data=medians, aes(xintercept=median_length, color=origin)) +
facet_wrap(~region, nrow=1) +
scale_fill_manual(values = c("red", "blue")) +
xlim(0, 1e6) +
theme(axis.text.x = element_text(angle = 90, hjust=1))
## Warning: Removed 509 rows containing non-finite values (`stat_density()`).
medians %>% knitr::kable()
| region | origin | median_length | count | nsamples |
|---|---|---|---|---|
| CentralAsiaSiberia | Denisovan | 63000.0 | 1653 | 27 |
| CentralAsiaSiberia | Neanderthal | 74000.0 | 16219 | 27 |
| EastAsia | Denisovan | 60000.0 | 10907 | 132 |
| EastAsia | Neanderthal | 72000.0 | 85361 | 132 |
| Melanesia | Denisovan | 82000.0 | 73507 | 89 |
| Melanesia | Neanderthal | 76944.5 | 81514 | 89 |
| SouthAsia | Denisovan | 59000.0 | 2664 | 39 |
| SouthAsia | Neanderthal | 69000.0 | 21786 | 39 |
| WestEurasia | Denisovan | 51000.0 | 2369 | 71 |
| WestEurasia | Neanderthal | 69000.0 | 33729 | 71 |
quantile((df %>%
filter(origin %in% c("Neanderthal","Denisovan")))$length)
## 0% 25% 50% 75% 100%
## 1000 42000 74000 139000 4698985
quantile((df %>%
filter(origin %in% c("Neanderthal","Denisovan")) %>%
mutate(total_snps = Shared_with_Altai + Shared_with_Denisova + Shared_with_Vindija))$total_snps)
## 0% 25% 50% 75% 100%
## 1 6 14 29 561
# Use the quantiles (median) as the threshold
df %>%
filter(origin %in% c("Neanderthal","Denisovan")) %>%
mutate(total_snps = Shared_with_Altai + Shared_with_Denisova + Shared_with_Vindija) %>%
filter(total_snps > 19) %>%
ggplot() +
geom_density(aes(length, fill=origin), color=NA,alpha=0.5) +
theme_bw() +
facet_grid(~region) +
xlim(0,2000000) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 8 rows containing non-finite values (`stat_density()`).
df %>%
filter(origin %in% c("Neanderthal","Denisovan")) %>%
mutate(total_snps = Shared_with_Altai + Shared_with_Denisova + Shared_with_Vindija) %>%
filter(total_snps > 50) %>%
ggplot(aes(x=origin,y=length)) +
geom_boxplot() +
ggtitle('Denisovan fragment length per region') +
theme_bw() +
facet_grid(~region) +
scale_x_discrete() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
df %>%
filter(origin == "Neanderthal") %>%
group_by(region) %>%
ggplot(aes(length)) +
geom_histogram(bins = 50) +
ggtitle('Neanderthal fragment length per region') +
theme_bw() + facet_grid(~region) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) -> plot1
df %>%
filter(origin == "Denisovan") %>%
ggplot(aes(length)) +
geom_histogram(bins = 50) +
ggtitle('Denisovan fragment length per region') +
theme_bw() + facet_grid(~region) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) -> plot2
df %>%
filter(origin == "Unclassified") %>%
group_by(region) %>%
ggplot(aes(length)) +
geom_histogram(bins = 50) +
ggtitle('Unclassified fragment length per region') +
theme_bw() + facet_grid(~region) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) -> plot3
plot_grid(plot1, plot2, plot3, ncol=1)
df %>%
group_by(chrom, region) %>%
summarise(Frequency = sum(as.numeric(length))) %>%
ggplot(aes(x = chrom, y = Frequency, fill = region)) +
geom_bar(position = "dodge", stat="identity") +
theme_bw() +
scale_x_discrete(limits= c(seq(1, 22, 1),'X')) + # ordering from 1 to 22 +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.
df %>%
group_by(chrom, origin) %>%
summarise(Frequency = sum(as.numeric(length))) %>%
ggplot(aes(x = chrom, y = Frequency, fill = origin)) +
geom_bar(position = "dodge", stat="identity") +
theme_bw() +
scale_x_discrete(limits= c(seq(1, 22, 1),'X')) + # ordering from 1 to 22
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.
chrom <- factor(c(1:22, 'X'), levels=c(1:22,'X'))
chrom_len <- c(2.50e+08, 2.45e+08, 1.98e+08, 1.95e+08, 1.82e+08,
1.71e+08, 1.60e+08, 1.47e+08, 1.42e+08, 1.36e+08,
1.35e+08, 1.35e+08, 1.16e+08, 1.08e+08, 1.03e+08,
9.03e+07, 8.33e+07, 8.04e+07, 6.00e+07, 6.44e+07,
4.90e+07, 5.15e+07, 1.55e+08)
df %>%
mutate(chrom = factor(chrom, levels = c(1:22, 'X'))) %>%
arrange(region) %>%
ggplot() +
theme_bw() +
geom_segment(aes(x=start,xend=end,y=0, yend=1, col=origin), alpha=0.02) +
geom_blank() +
geom_rect(data=tibble(chrom, chrom_len), aes(xmin=chrom_len, xmax=Inf, ymin=0, ymax=1), fill="grey") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
facet_wrap(~chrom) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.y=element_blank(),
legend.position="none") +
xlab("Position")
# Histogram
df %>%
group_by(chrom, region) %>%
filter(chrom %in% c('8', 'X')) %>%
summarise(Frequency = sum(as.numeric(length))) %>%
ggplot(aes(x = chrom, y=Frequency, fill = region)) +
geom_bar(position='dodge', stat='identity') +
theme_bw() +
theme(axis.text.x = element_text(angle=90, hjust=1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.
# Boxplot
df %>%
group_by(chrom, region) %>%
filter(chrom %in% c('8', 'X')) %>%
summarise(Frequency = sum(as.numeric(length))) %>%
ggplot(aes(x = chrom, y = Frequency)) +
geom_boxplot() +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `summarise()` has grouped output by 'chrom'. You can override using the
## `.groups` argument.
Let us look at the EPAS1 as an example of adaptive introgression. This gene maps to chr2:46,520,806-46,613,842 in GRCh37 coordinates. Tibetan individuals have an increased amount of archaic polymorphism in this region.
df %>%
filter(origin %in% c("Neanderthal", "Denisovan")) %>%
filter(chrom == 2 & region == 'EastAsia' & start > 4.65e+07 & end < 4.67e+07) %>%
ggplot(aes(x=length, fill=origin)) +
geom_histogram(bins=50) +
facet_wrap(~country, nrow = 1) +
theme(axis.text.x = element_text(angle=90, hjust=1))
Contrary, FOXP2 is an example of maladaptive introgression, consistent with the presence of deserts of archaic origin in around this gene.
df %>%
filter(chrom == "7") %>%
arrange(region) %>%
ggplot() +
theme_bw() +
geom_segment(aes(x=start,xend=end,y=0, yend=1, col=origin), alpha=0.02) +
facet_wrap(~region, ncol=1) +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.text.y=element_blank(),
axis.ticks=element_blank(),
axis.title.y=element_blank(),
legend.position="none") +
geom_vline(xintercept = 113726365, alpha=0.5) +
xlab("Position") +
ggtitle(paste("Chromosome 7")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))